This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#Load packages
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("here")
## here() starts at C:/Users/bailey.spiegelberg/Desktop/biostat776project1
#tests if a directory named "data" exists locally
if (!dir.exists(here("data"))) {
dir.create(here("data"))
}
# saves data only once (not each time you knit a R Markdown)
if (!file.exists(here("data", "chocolate.RDS"))) {
url_csv <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv"
chocolate <- readr::read_csv(url_csv)
# save the file to RDS objects
saveRDS(chocolate, file = here("data", "chocolate.RDS"))
}
chocolate <- readRDS(here("data", "chocolate.RDS"))
as_tibble(chocolate)
## # A tibble: 2,530 × 10
## ref company_manufacturer company_location review_date
## <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019
## 2 2458 5150 U.S.A. 2019
## 3 2454 5150 U.S.A. 2019
## 4 2542 5150 U.S.A. 2021
## 5 2546 5150 U.S.A. 2021
## 6 2546 5150 U.S.A. 2021
## 7 2542 5150 U.S.A. 2021
## 8 797 A. Morin France 2012
## 9 797 A. Morin France 2012
## 10 1011 A. Morin France 2013
## # ℹ 2,520 more rows
## # ℹ 6 more variables: country_of_bean_origin <chr>,
## # specific_bean_origin_or_bar_name <chr>, cocoa_percent <chr>,
## # ingredients <chr>, most_memorable_characteristics <chr>, rating <dbl>
glimpse(chocolate)
## Rows: 2,530
## Columns: 10
## $ ref <dbl> 2454, 2458, 2454, 2542, 2546, 2546, 2…
## $ company_manufacturer <chr> "5150", "5150", "5150", "5150", "5150…
## $ company_location <chr> "U.S.A.", "U.S.A.", "U.S.A.", "U.S.A.…
## $ review_date <dbl> 2019, 2019, 2019, 2021, 2021, 2021, 2…
## $ country_of_bean_origin <chr> "Tanzania", "Dominican Republic", "Ma…
## $ specific_bean_origin_or_bar_name <chr> "Kokoa Kamili, batch 1", "Zorzal, bat…
## $ cocoa_percent <chr> "76%", "76%", "76%", "68%", "72%", "8…
## $ ingredients <chr> "3- B,S,C", "3- B,S,C", "3- B,S,C", "…
## $ most_memorable_characteristics <chr> "rich cocoa, fatty, bready", "cocoa, …
## $ rating <dbl> 3.25, 3.50, 3.75, 3.00, 3.00, 3.25, 3…
library(dplyr)
library(ggplot2)
Changing the the number of bins affects how many bars in the histogram will be present. For example, with the default (30 bins) there are 12 bars but with (bins = 10), there are 9 bars. I chose (bins = 15) as I think it gives us the best distribution.
ggplot(chocolate, aes(x=rating)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(chocolate, aes(x=rating)) + geom_histogram(bins = 10)
ggplot(chocolate, aes(x=rating)) + geom_histogram(bins = 15)
ggplot(chocolate, aes(x=rating)) + geom_histogram(bins = 20)
ggplot(chocolate, aes(x=rating)) + geom_histogram(bins = 25)
library(ThemePark)
ggplot(chocolate, aes(x=rating)) +
geom_histogram (bins = 15) +
labs(title = "Rating Distribution", x = "Rating", y = "Count") +
theme_dune()
countrybeanreview = chocolate %>%
group_by(country_of_bean_origin) %>%
summarise (n())
print(countrybeanreview)
## # A tibble: 62 × 2
## country_of_bean_origin `n()`
## <chr> <int>
## 1 Australia 3
## 2 Belize 76
## 3 Blend 156
## 4 Bolivia 80
## 5 Brazil 78
## 6 Burma 1
## 7 Cameroon 3
## 8 China 1
## 9 Colombia 79
## 10 Congo 11
## # ℹ 52 more rows
Ecuador = chocolate %>%
group_by(country_of_bean_origin =="Ecuador") %>%
summarise (mean = mean(rating), total = n(), sd = sd(rating))
print (Ecuador)
## # A tibble: 2 × 4
## `country_of_bean_origin == "Ecuador"` mean total sd
## <lgl> <dbl> <int> <dbl>
## 1 FALSE 3.20 2311 0.438
## 2 TRUE 3.16 219 0.512
The companies with the highest ratings are Amano, Benoit Nihant, Beschle, Domori, Durci, Pacari, and The Smooth Chocolator
bestcompany = chocolate %>%
filter(country_of_bean_origin =="Ecuador" & rating == 4) %>%
group_by(company_manufacturer)
print (bestcompany)
## # A tibble: 8 × 10
## # Groups: company_manufacturer [7]
## ref company_manufacturer company_location review_date country_of_bean_origin
## <dbl> <chr> <chr> <dbl> <chr>
## 1 470 Amano U.S.A. 2010 Ecuador
## 2 1141 Benoit Nihant Belgium 2013 Ecuador
## 3 636 Beschle (Felchlin) Switzerland 2011 Ecuador
## 4 192 Domori Italy 2007 Ecuador
## 5 1630 Durci U.S.A. 2015 Ecuador
## 6 1415 Pacari Ecuador 2014 Ecuador
## 7 1622 Smooth Chocolator, … Australia 2015 Ecuador
## 8 1740 Smooth Chocolator, … Australia 2016 Ecuador
## # ℹ 5 more variables: specific_bean_origin_or_bar_name <chr>,
## # cocoa_percent <chr>, ingredients <chr>,
## # most_memorable_characteristics <chr>, rating <dbl>
Top countries with highest average ratings are Tobago, China and Sao Tome & Principe.
countryaverage = chocolate %>%
group_by(country_of_bean_origin) %>%
summarise (mean = mean(rating)) %>%
arrange(desc(mean))
print (countryaverage)
## # A tibble: 62 × 2
## country_of_bean_origin mean
## <chr> <dbl>
## 1 Tobago 3.62
## 2 China 3.5
## 3 Sao Tome & Principe 3.5
## 4 Solomon Islands 3.45
## 5 Congo 3.32
## 6 Thailand 3.3
## 7 Cuba 3.29
## 8 Vietnam 3.29
## 9 Papua New Guinea 3.28
## 10 Madagascar 3.27
## # ℹ 52 more rows
Now top countries with highest average ratings are Solomon Islands, Congo and Cuba.
morethan10 = chocolate %>%
group_by(country_of_bean_origin) %>%
filter(n()>=10) %>%
summarise (mean = mean(rating)) %>%
arrange(desc(mean))
print (morethan10)
## # A tibble: 35 × 2
## country_of_bean_origin mean
## <chr> <dbl>
## 1 Solomon Islands 3.45
## 2 Congo 3.32
## 3 Cuba 3.29
## 4 Vietnam 3.29
## 5 Papua New Guinea 3.28
## 6 Madagascar 3.27
## 7 Haiti 3.27
## 8 Brazil 3.26
## 9 Guatemala 3.26
## 10 Nicaragua 3.26
## # ℹ 25 more rows
library(lubridate)
library(tidyr)
#7.1 Identify the countries of bean origin with at least 50 reviews. Remove reviews from countries are not in this list.
atleast50 = chocolate %>%
group_by(country_of_bean_origin) %>%
summarise(reviews = n()) %>%
filter(reviews >=50)
print (atleast50)
## # A tibble: 16 × 2
## country_of_bean_origin reviews
## <chr> <int>
## 1 Belize 76
## 2 Blend 156
## 3 Bolivia 80
## 4 Brazil 78
## 5 Colombia 79
## 6 Dominican Republic 226
## 7 Ecuador 219
## 8 Guatemala 62
## 9 Madagascar 177
## 10 Mexico 55
## 11 Nicaragua 100
## 12 Papua New Guinea 50
## 13 Peru 244
## 14 Tanzania 79
## 15 Venezuela 253
## 16 Vietnam 73
#7.2 Using the variable describing the chocolate percentage for each review, create a new column that groups chocolate percentages into one of four groups: (i) <60%, (ii) >=60 to <70%, (iii) >=70 to <90%, and (iii) >=90%
atleast50 = chocolate %>%
group_by(country_of_bean_origin) %>%
mutate(cocoa_percent_group = case_when(
cocoa_percent < 60 ~ "i",
cocoa_percent >= 60 & cocoa_percent < 70 ~ "ii",
cocoa_percent >= 70 & cocoa_percent < 90 ~ "iii",
cocoa_percent >= 90 ~ "iv" )) %>%
arrange(cocoa_percent_group) %>%
summarise(reviews = n(), cocoa_percent_group = cocoa_percent_group ) %>%
filter(reviews >=50)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'country_of_bean_origin'. You can override
## using the `.groups` argument.
print(atleast50)
## # A tibble: 2,007 × 3
## # Groups: country_of_bean_origin [16]
## country_of_bean_origin reviews cocoa_percent_group
## <chr> <int> <chr>
## 1 Belize 76 ii
## 2 Belize 76 ii
## 3 Belize 76 ii
## 4 Belize 76 ii
## 5 Belize 76 ii
## 6 Belize 76 iii
## 7 Belize 76 iii
## 8 Belize 76 iii
## 9 Belize 76 iii
## 10 Belize 76 iii
## # ℹ 1,997 more rows
#3. Using the new column described in #2, re-order the factor levels (if needed) to be starting with the smallest percentage group and increasing to the largest percentage group
atleast50 = chocolate %>%
group_by(country_of_bean_origin) %>%
mutate(cocoa_percent_group = case_when(
cocoa_percent < 60 ~ "i",
cocoa_percent >= 60 & cocoa_percent < 70 ~ "ii",
cocoa_percent >= 70 & cocoa_percent < 90 ~ "iii",
cocoa_percent >= 90 ~ "iv" )) %>%
summarise(reviews = n(), cocoa_percent_group = cocoa_percent_group ) %>%
filter(reviews >=50) %>%
arrange(cocoa_percent_group)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'country_of_bean_origin'. You can override
## using the `.groups` argument.
print(atleast50)
## # A tibble: 2,007 × 3
## # Groups: country_of_bean_origin [16]
## country_of_bean_origin reviews cocoa_percent_group
## <chr> <int> <chr>
## 1 Blend 156 i
## 2 Blend 156 i
## 3 Blend 156 i
## 4 Blend 156 i
## 5 Blend 156 i
## 6 Blend 156 i
## 7 Blend 156 i
## 8 Blend 156 i
## 9 Blend 156 i
## 10 Blend 156 i
## # ℹ 1,997 more rows
#7.4 For each country, make a set of four side-by-side boxplots plotting the groups on the x-axis and the ratings on the y-axis. These plots should be faceted by country.
g = chocolate %>%
group_by(country_of_bean_origin) %>%
mutate(cocoa_percent_group = case_when(
cocoa_percent < 60 ~ "i",
cocoa_percent >= 60 & cocoa_percent < 70 ~ "ii",
cocoa_percent >= 70 & cocoa_percent < 90 ~ "iii",
cocoa_percent >= 90 ~ "iv" )) %>%
reframe(reviews = n(), cocoa_percent_group = cocoa_percent_group, rating = rating ) %>%
filter(reviews >=50) %>%
ggplot(aes(x = cocoa_percent_group, y = rating)) +
geom_boxplot (color = "deeppink") +
facet_wrap(~ country_of_bean_origin) +
labs(title = "Rating Of Cocoa Percent by Country", x = "cocoa percent group", y = "Rating") +
theme_barbie(plot.title = element_text(size=20, hjust = .5))
print (g)
On average, category ii is barely higher than category iii for highest ratings. Countries are split between category ii and iii on being the highest rated.
#Part 2: Join two datasets together
library(gapminder)
head(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
#2.1 Use this dataset it to create a new column called continent in our chocolate dataset1 that contains the continent name for each review where the country of bean origin is.
chocolate =readRDS(here("data", "chocolate.RDS"))
Chocolatewithcontinents = right_join(chocolate, gapminder,
by = c("country_of_bean_origin" = "country"),
relationship = "many-to-many")
print(Chocolatewithcontinents)
## # A tibble: 25,944 × 15
## ref company_manufacturer company_location review_date
## <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019
## 2 2454 5150 U.S.A. 2019
## 3 2454 5150 U.S.A. 2019
## 4 2454 5150 U.S.A. 2019
## 5 2454 5150 U.S.A. 2019
## 6 2454 5150 U.S.A. 2019
## 7 2454 5150 U.S.A. 2019
## 8 2454 5150 U.S.A. 2019
## 9 2454 5150 U.S.A. 2019
## 10 2454 5150 U.S.A. 2019
## # ℹ 25,934 more rows
## # ℹ 11 more variables: country_of_bean_origin <chr>,
## # specific_bean_origin_or_bar_name <chr>, cocoa_percent <chr>,
## # ingredients <chr>, most_memorable_characteristics <chr>, rating <dbl>,
## # continent <fct>, year <int>, lifeExp <dbl>, pop <int>, gdpPercap <dbl>
#2.2 Only keep reviews that have reviews from countries of bean origin with at least 10 reviews.
morethan10 = Chocolatewithcontinents %>%
group_by(country_of_bean_origin) %>%
summarise (reviews = n()) %>%
filter(n()>=10) %>%
arrange(desc(n()))
print (morethan10)
## # A tibble: 142 × 2
## country_of_bean_origin reviews
## <chr> <int>
## 1 Afghanistan 12
## 2 Albania 12
## 3 Algeria 12
## 4 Angola 12
## 5 Argentina 12
## 6 Australia 36
## 7 Austria 12
## 8 Bahrain 12
## 9 Bangladesh 12
## 10 Belgium 12
## # ℹ 132 more rows
#2.3 Also, remove the country of bean origin named "Blend".
Chocolatewithcontinents = Chocolatewithcontinents %>%
filter(country_of_bean_origin!="Blend")
print(Chocolatewithcontinents)
## # A tibble: 25,944 × 15
## ref company_manufacturer company_location review_date
## <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019
## 2 2454 5150 U.S.A. 2019
## 3 2454 5150 U.S.A. 2019
## 4 2454 5150 U.S.A. 2019
## 5 2454 5150 U.S.A. 2019
## 6 2454 5150 U.S.A. 2019
## 7 2454 5150 U.S.A. 2019
## 8 2454 5150 U.S.A. 2019
## 9 2454 5150 U.S.A. 2019
## 10 2454 5150 U.S.A. 2019
## # ℹ 25,934 more rows
## # ℹ 11 more variables: country_of_bean_origin <chr>,
## # specific_bean_origin_or_bar_name <chr>, cocoa_percent <chr>,
## # ingredients <chr>, most_memorable_characteristics <chr>, rating <dbl>,
## # continent <fct>, year <int>, lifeExp <dbl>, pop <int>, gdpPercap <dbl>
NAcontinents = Chocolatewithcontinents %>%
filter(continent = NA)
print (NAcontinents)
## # A tibble: 0 × 15
## # ℹ 15 variables: ref <dbl>, company_manufacturer <chr>,
## # company_location <chr>, review_date <dbl>, country_of_bean_origin <chr>,
## # specific_bean_origin_or_bar_name <chr>, cocoa_percent <chr>,
## # ingredients <chr>, most_memorable_characteristics <chr>, rating <dbl>,
## # continent <fct>, year <int>, lifeExp <dbl>, pop <int>, gdpPercap <dbl>
violin = ggplot(Chocolatewithcontinents, aes(x = continent, y = rating)) +
geom_violin() +
labs (x = "Continents", y = "Ratings")
print (violin)
## Warning: Removed 1248 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
#Part 3 Convert wide data into long data
chocolate =readRDS(here("data", "chocolate.RDS"))
#3.1 Create a new set of columns titled beans, sugar, cocoa_butter, vanilla, letchin, and salt that contain a 1 or 0 representing whether or not that review for the chocolate bar contained that ingredient (1) or not (0).
ingredients_chocolate = chocolate %>%
select(ingredients) %>%
mutate(
beans = as.integer(grepl("B", ingredients)),
sugar = as.integer(grepl("S", ingredients)),
cocoa_butter = as.integer(grepl ("C", ingredients)),
vanilla = as.integer(grepl("V", ingredients)),
letchin = as.integer(grepl("L", ingredients)),
salt = as.integer(grepl("SA", ingredients)),
)
print(ingredients_chocolate)
## # A tibble: 2,530 × 7
## ingredients beans sugar cocoa_butter vanilla letchin salt
## <chr> <int> <int> <int> <int> <int> <int>
## 1 3- B,S,C 1 1 1 0 0 0
## 2 3- B,S,C 1 1 1 0 0 0
## 3 3- B,S,C 1 1 1 0 0 0
## 4 3- B,S,C 1 1 1 0 0 0
## 5 3- B,S,C 1 1 1 0 0 0
## 6 3- B,S,C 1 1 1 0 0 0
## 7 3- B,S,C 1 1 1 0 0 0
## 8 4- B,S,C,L 1 1 1 0 1 0
## 9 4- B,S,C,L 1 1 1 0 1 0
## 10 4- B,S,C,L 1 1 1 0 1 0
## # ℹ 2,520 more rows
#3.2 Create a new set of columns titled char_cocoa, char_sweet, char_nutty, char_creamy, char_roasty, char_earthy that contain a 1 or 0 representing whether or not that the most memorable characteristic for the chocolate bar had that word (1) or not (0).
new_chocolate = chocolate %>%
select(ingredients, most_memorable_characteristics) %>%
mutate(
char_cocoa = as.integer(grepl("cocoa", most_memorable_characteristics)),
char_sweet = as.integer(grepl("sweet", most_memorable_characteristics)),
char_nutty = as.integer(grepl ("nutty", most_memorable_characteristics)),
char_creamy = as.integer(grepl("creamy", most_memorable_characteristics)),
char_roasty = as.integer(grepl("roasty", most_memorable_characteristics)),
char_earthy = as.integer(grepl("earthy", most_memorable_characteristics)),
)
print(new_chocolate)
## # A tibble: 2,530 × 8
## ingredients most_memorable_characteristics char_cocoa char_sweet char_nutty
## <chr> <chr> <int> <int> <int>
## 1 3- B,S,C rich cocoa, fatty, bready 1 0 0
## 2 3- B,S,C cocoa, vegetal, savory 1 0 0
## 3 3- B,S,C cocoa, blackberry, full body 1 0 0
## 4 3- B,S,C chewy, off, rubbery 0 0 0
## 5 3- B,S,C fatty, earthy, moss, nutty,chal… 0 0 1
## 6 3- B,S,C mildly bitter, basic cocoa, fat… 1 0 0
## 7 3- B,S,C milk brownie, macadamia,chewy 0 0 0
## 8 4- B,S,C,L vegetal, nutty 0 0 1
## 9 4- B,S,C,L fruity, melon, roasty 0 0 0
## 10 4- B,S,C,L brief fruit note, earthy, nutty 0 0 1
## # ℹ 2,520 more rows
## # ℹ 3 more variables: char_creamy <int>, char_roasty <int>, char_earthy <int>
all_chocolate = chocolate %>%
select(ingredients, most_memorable_characteristics, review_date) %>%
mutate(
beans = as.integer(grepl("B", ingredients)),
sugar = as.integer(grepl("S", ingredients)),
cocoa_butter = as.integer(grepl ("C", ingredients)),
vanilla = as.integer(grepl("V", ingredients)),
letchin = as.integer(grepl("L", ingredients)),
salt = as.integer(grepl("SA", ingredients)),
char_cocoa = as.integer(grepl("cocoa", most_memorable_characteristics)),
char_sweet = as.integer(grepl("sweet", most_memorable_characteristics)),
char_nutty = as.integer(grepl ("nutty", most_memorable_characteristics)),
char_creamy = as.integer(grepl("creamy", most_memorable_characteristics)),
char_roasty = as.integer(grepl("roasty", most_memorable_characteristics)),
char_earthy = as.integer(grepl("earthy", most_memorable_characteristics)),
)
print (all_chocolate)
## # A tibble: 2,530 × 15
## ingredients most_memorable_characteris…¹ review_date beans sugar cocoa_butter
## <chr> <chr> <dbl> <int> <int> <int>
## 1 3- B,S,C rich cocoa, fatty, bready 2019 1 1 1
## 2 3- B,S,C cocoa, vegetal, savory 2019 1 1 1
## 3 3- B,S,C cocoa, blackberry, full body 2019 1 1 1
## 4 3- B,S,C chewy, off, rubbery 2021 1 1 1
## 5 3- B,S,C fatty, earthy, moss, nutty,… 2021 1 1 1
## 6 3- B,S,C mildly bitter, basic cocoa,… 2021 1 1 1
## 7 3- B,S,C milk brownie, macadamia,che… 2021 1 1 1
## 8 4- B,S,C,L vegetal, nutty 2012 1 1 1
## 9 4- B,S,C,L fruity, melon, roasty 2012 1 1 1
## 10 4- B,S,C,L brief fruit note, earthy, n… 2013 1 1 1
## # ℹ 2,520 more rows
## # ℹ abbreviated name: ¹​most_memorable_characteristics
## # ℹ 9 more variables: vanilla <int>, letchin <int>, salt <int>,
## # char_cocoa <int>, char_sweet <int>, char_nutty <int>, char_creamy <int>,
## # char_roasty <int>, char_earthy <int>
#3.3 For each year (i.e. review_date), calculate the mean value in each new column you created across all reviews for that year.
mean_value = all_chocolate %>%
group_by(review_date) %>%
summarise(
mean_beans = mean(beans, na.rm = TRUE),
mean_sugar = mean(sugar, na.rm = TRUE),
mean_cocoa_butter = mean(cocoa_butter, na.rm = TRUE),
mean_vanilla = mean(vanilla, na.rm = TRUE),
mean_letchin = mean(letchin, na.rm = TRUE),
mean_salt = mean(salt, na.rm = TRUE),
mean_char_cocoa = mean(char_cocoa, na.rm = TRUE),
mean_char_sweet = mean(char_sweet, na.rm = TRUE),
mean_char_nutty = mean(char_nutty, na.rm = TRUE),
mean_char_creamy = mean(char_creamy, na.rm = TRUE),
mean_char_roasty = mean(char_roasty, na.rm = TRUE),
mean_char_earthy = mean(char_earthy, na.rm = TRUE),
)
str(mean_value)
## tibble [16 × 13] (S3: tbl_df/tbl/data.frame)
## $ review_date : num [1:16] 2006 2007 2008 2009 2010 ...
## $ mean_beans : num [1:16] 0.968 0.945 0.913 0.919 0.855 ...
## $ mean_sugar : num [1:16] 0.968 0.945 0.902 0.919 0.855 ...
## $ mean_cocoa_butter: num [1:16] 0.903 0.767 0.75 0.772 0.709 ...
## $ mean_vanilla : num [1:16] 0.694 0.548 0.359 0.325 0.227 ...
## $ mean_letchin : num [1:16] 0.694 0.384 0.511 0.341 0.391 ...
## $ mean_salt : num [1:16] 0 0 0 0 0 0 0 0 0 0 ...
## $ mean_char_cocoa : num [1:16] 0.21 0.342 0.109 0.146 0.218 ...
## $ mean_char_sweet : num [1:16] 0.1613 0.0959 0.1304 0.1545 0.1 ...
## $ mean_char_nutty : num [1:16] 0.0323 0.0411 0.1522 0.1545 0.1455 ...
## $ mean_char_creamy : num [1:16] 0.2419 0.2329 0.0978 0.0894 0.0909 ...
## $ mean_char_roasty : num [1:16] 0.0484 0.0137 0.0435 0.0813 0.0364 ...
## $ mean_char_earthy : num [1:16] 0.0645 0.0685 0.0435 0.0732 0.0727 ...
#3.4 Convert this wide dataset into a long dataset with a new feature and mean_score column.
long_mean = pivot_longer(
mean_value,
cols = starts_with("mean_"),
names_to = "features",
values_to = "mean_score"
)
print(long_mean)
## # A tibble: 192 × 3
## review_date features mean_score
## <dbl> <chr> <dbl>
## 1 2006 mean_beans 0.968
## 2 2006 mean_sugar 0.968
## 3 2006 mean_cocoa_butter 0.903
## 4 2006 mean_vanilla 0.694
## 5 2006 mean_letchin 0.694
## 6 2006 mean_salt 0
## 7 2006 mean_char_cocoa 0.210
## 8 2006 mean_char_sweet 0.161
## 9 2006 mean_char_nutty 0.0323
## 10 2006 mean_char_creamy 0.242
## # ℹ 182 more rows
#Part 4: Data visualization
scatter_plot = long_mean %>%
ggplot(aes(x = review_date, y = mean_score)) +
geom_point (color = "white") +
geom_smooth(color = "gold") +
facet_wrap(~features) +
labs(title = "Scatter plot of mean scores over time, subsetted by ingredients and characteristics", subtitle = "percentage of letchin and vanilla in chocolates have decreased over time, while mention of a cocoa charactistic has increased", caption = "Bailey Spiegelberg", x = "Time (review date)", y = "mean score") +
theme_alien(plot.title = element_text(size=10),
plot.subtitle = element_text(size=6),
plot.caption = element_text(size=8))
print (scatter_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#Part 5: Make the worst plot you can!
badplot = chocolate %>%
ggplot(aes(
x = as.factor(review_date),
y = rating
)) +
geom_point(color = "yellow",
size = .5,
shape = 11,
fill = "red",
linewidth = 10) +
theme_asteroid_city(axis.text.x = element_text(size =15))
## Warning in geom_point(color = "yellow", size = 0.5, shape = 11, fill = "red", :
## Ignoring unknown parameters: `linewidth`
print(badplot)
#Part 6: Make my plot a better plot!
chocolate =readRDS(here("data", "chocolate.RDS"))
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot = chocolate %>%
ggplot(aes(
x = as.factor(review_date),
y = rating,
fill = review_date
)) +
geom_violin(color = "green") +
labs(title = "Ratings over the Years", x = "Review Date", y = "Ratings", fill = "Review Date") +
theme_alien(plot.title = element_text(size=16, hjust = .5),
plot.caption = element_text(size=8),
axis.text.x = element_text(size =7))
plotly::ggplotly((plot))